Data Introduction

Parks and recreation was a television comedy show that aired on NBC from 2009 until 2015. I obtained the complete transcripts and performed text analysis on the dialogue of the show. Link to data.

file_names <- list.files(here("scripts")) # file names for each episode

parks <- str_glue("scripts/{file_names}") %>% 
  map_dfr(read_csv) # read in all the episodes into one data frame!

# Tokenize lines to one word in each row
parks_token <- parks %>% 
  clean_names() %>% 
  unnest_tokens(word, line) %>% # tokenize
  anti_join(stop_words) %>% # remove stop words
  mutate(word = str_extract(word, "[a-z']+")) %>% # extract words only
  drop_na(word) # take out missing values

# Filter the top 10 characters with the most words
top_characters <- parks_token %>%
  dplyr::filter(character != "Extra") %>% 
  count(character, sort = TRUE) %>%
  slice_max(n, n = 10) 

# Obtain words only from the top 10 characters
parks_words <- parks_token %>% 
  inner_join(top_characters) %>% 
  filter(!word %in% c("hey", "yeah", "gonna")) %>% 
  select(-n) %>% 
  count(word, character, sort = TRUE) %>% 
  ungroup() %>% 
  group_by(character) %>% 
  top_n(9) # top 9

# Sample of a few lines from the show
parks %>% 
  slice(sample(1:65942, 20)) %>% 
  kbl(caption = "<b style = 'color:white;'>
      Sample of a few randomly chosen lines from Parks and Recreation") %>%
  kable_material_dark(bootstrap_options = c("striped", "hover")) %>%
  scroll_box(width = "100%", height = "300px", 
             fixed_thead = list(enabled = T, background = "#222222"))
Sample of a few randomly chosen lines from Parks and Recreation
Character Line
Ben Wyatt Can you actually do this?
Leslie Knope You’re never going to be able to pull it off.
Leslie Knope Yes.
Leslie Knope Well, they couldn’t make a banner in time for my press conference, so I am cobbling together a banner from all the other discarded banners.
Ann Perkins We’re moving to a new city.
Andy Dwyer I want nickels.
Ben Wyatt And I didn’t really like it.
Tom Haverford I’m good at this stuff.
Donna Meagle One for pleasure, and one to cap off a long con I was running against Keith Sweat.
Chance Frenlm Topless park!
Leslie Knope I’m gonna call Kaboom, and check on their availability for next year.
Ben Wyatt Now, in this town it’s bigger than the super bowl.
Ann Perkins What the hell happened?
Ben Wyatt Oh.
Tom Haverford Excuse me!
Donna Meagle When did you make the switch to boxer briefs?
Linda Lonegan My Uncle Lon once tried to
Leslie Knope Did I hit your boob?
Leslie Knope So first up, meet and greet.
Leslie Knope He doesn’t know anybody in town.

Word Count of Major Characters

ggplot(data = parks_words, 
       aes(x = n, y = word, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_viridis_c(option = "plasma") +
  facet_wrap(~character, scales = "free") +
  theme_brooklyn99() +
  theme(panel.grid.major.y = element_blank(),
        axis.text.x = element_text(size = 9),
        axis.text.y = element_text(size = 10),
        axis.title = element_blank(),
        panel.grid.minor = element_blank(),
        strip.text = element_text(color = "white",
                                  face = "bold",
                                  size = 10.5))

Wordcloud

# Ron Swanson

swanson_words <- parks_token %>% 
  filter(character == "Ron Swanson") %>% # filter for character
  filter(!word %in% c("hey", "yeah", "gonna")) %>% # remove some more stopwords
  count(word) %>% 
  slice_max(n,n = 25) # choose top 25 words
  
swanson_pic <- jpeg::readJPEG(here("images","ron_swanson.jpg")) 

swanson_cloud <- ggplot(data = swanson_words,
                        aes(label = word)) +
  background_image(swanson_pic) + # add image of character
  geom_text_wordcloud(aes(size = n), 
                      color = "turquoise1",
                      shape = "circle") +
  scale_size_area(max_size = 6) +
  theme_void()

# Lesile Knope

knope_words <- parks_token %>% 
  filter(character == "Leslie Knope") %>% 
  filter(!word %in% c("hey", "yeah", "gonna")) %>% # remove some more stopwords
  count(word) %>% 
  slice_max(n,n = 25)
  
knope_pic <- jpeg::readJPEG(here("images", "knope.jpg"))

knope_cloud <- ggplot(data = knope_words,
                        aes(label = word)) +
  background_image(knope_pic) +
  geom_text_wordcloud(aes(size = n), 
                      color = "turquoise1",
                      shape = "star") +
  scale_size_area(max_size = 6) +
  theme_void()

# April Ludgate

april_words <- parks_token %>% 
  filter(character == "April Ludgate") %>% 
  filter(!word %in% c("hey", "yeah", "gonna")) %>% # remove some more stopwords
  count(word) %>% 
  slice_max(n,n = 25)
  
april_pic <- jpeg::readJPEG(here("images", "april.jpeg"))

april_cloud <- ggplot(data = april_words,
                        aes(label = word)) +
  background_image(april_pic) +
  geom_text_wordcloud(aes(size = n), 
                      color = "turquoise1",
                      shape = "triangle-upright") +
  scale_size_area(max_size = 6) +
  theme_void()

# Andy Dwyer

andy_words <- parks_token %>% 
  filter(character == "Andy Dwyer") %>% 
  filter(!word %in% c("hey", "yeah", "gonna")) %>% # remove some more stopwords
  count(word) %>% 
  slice_max(n,n = 25)
  
andy_pic <- jpeg::readJPEG(here("images", "andy.jpg"))

andy_cloud <- ggplot(data = andy_words,
                        aes(label = word)) +
  background_image(andy_pic) +
  geom_text_wordcloud(aes(size = n), 
                      color = "turquoise1",
                      shape = "diamond") +
  scale_size_area(max_size = 6) +
  theme_void()

# Final patcwork wordcloud

patchwork <-  (andy_cloud + april_cloud) / (knope_cloud + swanson_cloud) 

patchwork & theme(plot.background = element_rect(fill = "#222222",
                                                 color = "#222222"),
                  strip.background = element_rect(fill = "#222222",
                                                 color = "#222222"))

Sentiment Analysis throughout entire show

parks <- str_glue("scripts/{file_names}") %>% 
  map_dfr(read_csv) # read in all the episodes into one data frame!

file_names_season <- str_sub(file_names, start = 3L)

# which(file_names_season == "e01.csv")

# season_1 <- str_glue("scripts/{file_names[1:6]}") %>% 
#   map_dfr(read_csv)
# 
# season_2 <- str_glue("scripts/{file_names[7:30]}") %>% 
#   map_dfr(read_csv)
# 
# season_3 <- str_glue("scripts/{file_names[31:46]}") %>% 
#   map_dfr(read_csv)

season_4 <- str_glue("scripts/{file_names[47:68]}") %>% 
  map_dfr(read_csv)

# season_5 <- str_glue("scripts/{file_names[69:90]}") %>% 
#   map_dfr(read_csv)
# 
# season_6 <- str_glue("scripts/{file_names[91:110]}") %>% 
#   map_dfr(read_csv)
# 
# season_7 <- str_glue("scripts/{file_names[111:length(file_names)]}") %>% 
#   map_dfr(read_csv)



parks_afinn <- parks_token %>% 
  inner_join(get_sentiments("afinn")) %>%
  drop_na(value) %>% 
  mutate(index = seq(1, length(word) ,1)) %>% 
  mutate(moving_avg = as.numeric(slide(value,
                                       mean, 
                                       .before = (151 - 1)/2 , 
                                       .after = (151 - 1)/2 ))) %>% 
  mutate(neg_pos = factor(case_when(
    moving_avg > 0 ~ "Positive",
    moving_avg <= 0 ~ "Negative"
  )))

sent_plot <- ggplot(data = parks_afinn, aes(x = index, y = moving_avg)) +
  geom_col(aes(fill = neg_pos)) +
  scale_fill_manual(values = c("Positive" = "springgreen2",
                               "Negative" = "darkred"))+
  theme_minimal() +
  labs(x = "Index",
       y = "Moving Average sentiment",
       fill = "") +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.major.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 11,
                                   face = "bold",
                                   color = "white"),
        axis.title.y = element_text(color = "white",
                                  size = 12,
                                  face = "bold"),
        axis.title.x = element_blank(),
        panel.grid.minor = element_blank(),
        plot.background = element_rect(fill = "#222222", 
                                       color = "#222222"),
        strip.background = element_rect(fill = "#222222", 
                                        color = "#222222"),
        legend.text = element_text(color = "white",
                                  size = 11,
                                  face = "bold"))

# Make it interactive:
# ggplotly(sent_plot, tooltip = c("index", "character", "word", "value"))

sent_plot

Sentiment Anaylsis of an episode

# Tokenize lines to one word in each row
season_token <-  season_4 %>% 
  clean_names() %>% 
  unnest_tokens(word, line) %>% # tokenize
  anti_join(stop_words) %>% # remove stop words
  mutate(word = str_extract(word, "[a-z']+")) %>% # extract words only
  drop_na(word) # take out missing values

season_afinn <- season_token %>% 
  inner_join(get_sentiments("afinn")) %>%
  drop_na(value) %>% 
  mutate(index = seq(1, length(word) ,1)) %>% 
  mutate(moving_avg = as.numeric(slide(value,
                                       mean, 
                                       .before = (51 - 1)/2 , 
                                       .after = (51 - 1)/2 ))) 

season_plot <- ggplot(data = season_afinn, aes(x = index, y = moving_avg)) +
  geom_col(aes(fill = moving_avg)) +
  scale_fill_distiller(type = "div",
                       palette = "BrBG")+
  theme_minimal() +
  labs(x = "Index",
       y = "AFINN sentiment",
       fill = "") +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.major.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size = 11,
                                   face = "bold",
                                   color = "white"),
        axis.title.y = element_text(color = "white",
                                  size = 12,
                                  face = "bold"),
        axis.title.x = element_blank(),
        panel.grid.minor = element_blank(),
        plot.background = element_rect(fill = "#222222", 
                                       color = "#222222"),
        strip.background = element_rect(fill = "#222222", 
                                        color = "#222222"),
        legend.text = element_text(color = "white",
                                  size = 11,
                                  face = "bold"))

# Make it interactive:
# ggplotly(sent_plot, tooltip = c("index", "character", "word", "value"))

season_plot

Sentiment Analysis for all the character

characters_sent <-  parks_token %>%
  inner_join(top_characters) %>%
  filter(!word %in% c("hey", "yeah", "gonna")) %>%
  select(-n) %>% 
  inner_join(get_sentiments("nrc")) %>% 
  count(sentiment, character, sort = TRUE)

ggplot(data = characters_sent, 
       aes(x = n, y = sentiment, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_viridis_c(option = "plasma") +
  facet_wrap(~character, scales = "free") +
  theme_brooklyn99() +
  theme(panel.grid.major.y = element_blank(),
        axis.text.x = element_text(size = 9),
        axis.text.y = element_text(size = 10),
        axis.title = element_blank(),
        panel.grid.minor = element_blank(),
        strip.text = element_text(color = "white",
                                  face = "bold",
                                  size = 10.5))